# Commands to start an interactive session on the JHPCE cluster
qrsh -l mem_free=20G,h_vmem=20G
module load conda_R
cd /fastscratch/myscratch/akuo/alsf-filbin
R
library(here)
dataset = "mrna" # options = "premrna_mrna", "mrna", or "premrna"
First, we create a table with information about where each file (quantified counts from salmon) is. This will be used to create a SummarizedExperiment object.
# list tumor names
tumor_names <- list.files(here("sample_data"))[
!grepl("*.txt", list.files(here("sample_data")))]
unique_sf_paths <- NULL
for(tum in tumor_names){
ids <- list.files(here("sample_data", tum))
ids <- unique(stringr::str_sub(ids, end=-13))
if(!identical(ids, character(0))){
ids <- here(paste0("salmon_quants_", dataset), paste0(ids, "_quant"), "quant.sf")
}
unique_sf_paths <- c(unique_sf_paths, ids)
}
unique_sf_ids <- NULL
for(tum in tumor_names){
ids <- list.files(here("sample_data", tum))
ids <- unique(stringr::str_sub(ids, end=-13))
unique_sf_ids <- c(unique_sf_ids, ids)
}
coldata <- data.frame(files = unique_sf_paths, names = unique_sf_ids)
We also need to use the linkedTxome object to use tximeta properly, i.e. rowRanges(se) won’t be NULL and tximeta will be able to match the transcripts to the genes for us. Note: still doesn’t work
suppressPackageStartupMessages({
library(tximeta)
library(BiocFileCache)
})
# check if linkedTxome is already in the cache
bfcloc <- getTximetaBFC()
bfc <- BiocFileCache(bfcloc)
bfcinfo(bfc)
# if not, load linkedTxome json file
json_file <- here("salmon_files", "gencode.v32_salmon-index-v1.0.0.json")
loadLinkedTxome(json_file)
The only way I’ve figured out how to make this work for now is with skipMeta = TRUE.
se_file_name = here(paste0("salmon_quants_", dataset), paste0("se_", dataset, ".rds"))
# coldata = coldata[1:2, ] # for testing
if(!file.exists(se_file_name)){
# Create SummarizedExperiment object
if(dataset == "premrna")
se <- tximeta(coldata, skipMeta = TRUE) # Takes a couple of minutes, file size = 800 MB
else if (dataset == "mrna")
se <- tximeta(coldata) # run this line if you used mRNA transcripts in your index only, it will automatically detect the right transcriptome
# se <- tximeta(coldata, ignoreAfterBar = TRUE)
saveRDS(se, se_file_name)
} else {
se = readRDS(se_file_name) # Takes a couple of seconds
}
suppressPackageStartupMessages({
library(SummarizedExperiment)
library(DESeq2)
library(scater)
})
colData(se)
assayNames(se)
rowRanges(se)
dat = assay(se, "counts")
A SingleCellExperiment class is derived from the SummarizedExperiment class. The most important change is the addition of a new slot called reducedDims. Read more here.
# BiocManager::install('SingleCellExperiment')
library(SingleCellExperiment)
sce = SingleCellExperiment(assays = list(counts = dat))
# you can access counts by assay(sce, "counts") or counts(sce)
# you can add a new entry to assays slot by assay(sce, "counts_new") = dat_new
# This will take a long time (> 45 min); should probably separate out as a script
tic()
sce <- getBMFeatureAnnos(sce,
ids = rownames(sce),
filters = "ensembl_gene_id",
attributes = c("ensembl_gene_id", "hgnc_symbol",
"gene_biotype"),
biomart = "ENSEMBL_MART_ENSEMBL",
dataset = "hsapiens_gene_ensembl",
host = "www.ensembl.org")
toc()
rowData(sce)$gene_name_unique <-
uniquifyFeatureNames(rowData(sce)$gene_name_ensembl,
rowData(sce)$hgnc_symbol)
There are a couple of QC metrics to identify low-quality cells:
low_lib_size or (b) few expressed endogeneous genes (nonzero counts for those genes) low_n_featuresI will only do (1) for now.
library(scater)
# Compute quality control metrics:
# sum is the total count for each cell
# detected contains the number of detected genes (actually transcripts for our data)
df = perCellQCMetrics(sce)
df
# Find outliers with low library sizes (LibSize) and few detected features (n_features)
reasons = quickPerCellQC(df) # DataFrame of logical values
colSums(as.matrix(reasons))
# Discard outliers
filtered = sce[, !reasons$discard]
dat_filtered = counts(filtered) # 226608 x 1018 for mrna, 221988 x 1090 for premrna
sce_filtered = SingleCellExperiment(assays = list(counts = dat_filtered))
filtered_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_filtered_", dataset, ".rds"))
saveRDS(sce_filtered, filtered_file_name)
Diagnostic plots: https://osca.bioconductor.org/quality-control.html#quality-control-plots
library(here)
library(ggplot2)
library(dplyr)
library(tidyr)
library(SingleCellExperiment)
dataset = "mrna" # options = "premrna_mrna", "mrna", or "premrna"
Run compute-mle.sh to get mle sig (“shape”) parameter first. There will be a parameter for every cell. It will take about 30 minutes to an hour.
# Load filtered counts
filtered_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_filtered_", dataset, ".rds"))
if(file.exists(filtered_file_name)){
sce_filtered = readRDS(filtered_file_name)
dat_filtered = counts(sce_filtered)
}
# Read in mle parameters
mle_file_names = list.files(here(paste0("mle_results_", dataset)), full.names = TRUE)
mle_results = lapply(mle_file_names, readRDS)
sig_vec = sapply(mle_results, function(r) r["sig"])
mu_vec = sapply(mle_results, function(r) r["mu"])
# Plot sig ("shape") distribution
ggplot(tibble(sig = sig_vec), aes(x = sig)) +
geom_histogram(bins = 30) +
labs(title = "Distribution of sig parameter",
x = "sig",
y = "Number of cells")
source(here("scripts", "quminorm.R"))
# Convert to pseudo-UMIs
umi_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_umi_", dataset, ".rds"))
if(file.exists(umi_file_name)){
dat_umi = readRDS(umi_file_name)
} else {
dat_umi = quminorm_matrix(dat_filtered, shape = median(sig_vec)) # Takes several minutes (20~30 min). In the paper, they use the mode, but I will use the median since it is not clear what mode means for continuous values. The median for mRNA is 3.16. The median for pre-mRNA is 3.28.
dat_umi = dat_umi[!(rowSums(dat_umi) == 0), ] # Remove transcripts with all zeros
sce_umi = SingleCellExperiment(assays = list(counts = dat_umi))
saveRDS(sce_umi, umi_file_name)
}
umi_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_umi_", dataset, ".rds"))
sce_umi = readRDS(umi_file_name)
dat_umi = counts(sce_umi)
# Counts by transcript
dim(dat_umi) # 192459 x 1018 for mRNA, 186368 x 1090 for premRNA
# Read in mrna data to look up genes
se_mrna = readRDS(here(paste0("salmon_quants_mrna"), paste0("se_mrna.rds")))
transcripts_mrna = rownames(assay(se_mrna, "counts"))
row_ranges = rowRanges(se_mrna)
genes = unlist(elementMetadata(row_ranges)$gene_id)
mrna_gene_matching_dt = tibble(transcript = transcripts_mrna, genes)
if(dataset == "mrna"){
matched_index = match(rownames(dat_umi), mrna_gene_matching_dt$transcript)
dat_gene = rowsum(dat_umi, group = mrna_gene_matching_dt$genes[matched_index], reorder = FALSE)
dim(dat_gene) # 52142 x 1018 for mRNA
} else if(dataset == "premrna") {
# Get genes for every premrna transcript
# 171987 in intersection of premRNA and mRNA transcripts
# 20472 mRNA transcripts did not have an premRNA counterpart
# 14381 premRNA transcripts did not have a mRNA counterpart
# Note some transcripts were dropped in the pseudo-UMI conversion step (when removing transcripts with zero counts). If including those,
# 221685 in intersection
# 4923 mRNA transcripts did not have a premRNA counterpart
# 303 premRNA transcripts did not have a mRNA counterpart
transcripts_premrna = rownames(dat_umi)
transcripts_premrna_converted = gsub(".{8}$", "" , transcripts_premrna)
matched_index = match(transcripts_premrna_converted, mrna_gene_matching_dt$transcript)
dat_gene = rowsum(dat_umi, group = mrna_gene_matching_dt$genes[matched_index], reorder = FALSE)
# Remove counts for unmatched transcripts
dat_gene = dat_gene[!is.na(rownames(dat_gene)), ]
dim(dat_gene) # 53006 x 1090
}
gene_file_name = here(paste0("salmon_quants_", dataset), paste0("sce_gene_", dataset, ".rds"))
sce_gene = SingleCellExperiment(assays = list(counts = dat_gene))
saveRDS(sce_gene, gene_file_name)
library(here)
library(ggplot2)
library(dplyr)
library(tidyr)
library(SingleCellExperiment)
## Warning: package 'SummarizedExperiment' was built under R version 3.6.2
## Warning: package 'S4Vectors' was built under R version 3.6.2
## Warning: package 'IRanges' was built under R version 3.6.2
## Warning: package 'DelayedArray' was built under R version 3.6.2
## Warning: package 'BiocParallel' was built under R version 3.6.2
library(scattermore)
library(tictoc)
sce_prem = readRDS(here(paste0("salmon_quants_premrna"), "sce_gene_premrna.rds"))
sce_mrna = readRDS(here(paste0("salmon_quants_mrna"), "sce_gene_mrna.rds"))
dat_prem = counts(sce_prem)
dat_mrna = counts(sce_mrna)
dat_prem_rowmeans = tibble(genes = rownames(dat_prem),
mean_prem = rowMeans(dat_prem))
dat_mrna_rowmeans = tibble(genes = rownames(dat_mrna),
mean_mrna = rowMeans(dat_mrna))
dat_rowmeans = full_join(dat_prem_rowmeans, dat_mrna_rowmeans, by = "genes")
tic()
dat_rowmeans %>%
ggplot(aes(x = log(mean_mrna),
y = log(mean_prem))) +
geom_point(alpha = 0.1) +
geom_abline(intercept = 0, slope = 1, color = "red") +
labs(x = "Log(average gene expression with mRNA)",
y = "Log(average gene expression with pre-mRNA)") +
theme_bw()
## Warning: Removed 3592 rows containing missing values (geom_point).
toc() # 2.5 sec
## 2.424 sec elapsed
# Smoothscatter
tic()
smoothScatter(x = log(dat_rowmeans$mean_mrna),
y = log(dat_rowmeans$mean_prem),
xlab = "Log(average gene expression with mRNA)",
ylab = "Log(average gene expression with pre-mRNA)")
toc() # 1 sec
## 0.347 sec elapsed
# Scattermoreplot version
tic()
d = dat_rowmeans %>%
mutate(log_mean_prem = log(mean_prem),
log_mean_mrna = log(mean_mrna)) %>%
select(log_mean_prem, log_mean_mrna)
d = d[complete.cases(d), ] %>% as.matrix()
scattermoreplot(d,
main='Scattermore version')
toc() # 0.5 sec
## 0.161 sec elapsed
MA plot is a plot of log-intensity ratios (M) vs log-intensity averages (A)
dat_rowmeans = dat_rowmeans %>%
mutate(m = log(mean_prem) - log(mean_mrna),
a = (log(mean_prem) + log(mean_mrna)/2))
# ggplot geom_point
dat_rowmeans %>%
ggplot(aes(x = a,
y = m)) +
geom_point(alpha = 0.1) +
geom_smooth() +
geom_hline(yintercept = 0, color = "red") +
labs(x = "A",
y = "M") +
theme_bw()
## Warning: Removed 3592 rows containing non-finite values (stat_smooth).
## Warning: Removed 3592 rows containing missing values (geom_point).
# ggplot smoothscatter version
dat_rowmeans %>%
ggplot(aes(x = a,
y = m)) +
stat_density2d(aes(fill = ..density..^0.3), geom = "tile", contour = FALSE, n = 200) +
scale_fill_continuous(low = "white", high = "#1C5E7E") +
geom_hline(yintercept = 0, color = "red") +
geom_smooth() +
labs(x = "A",
y = "M") +
theme_bw()
## Warning: Removed 3592 rows containing non-finite values (stat_density2d).
## Warning: Removed 3592 rows containing non-finite values (stat_smooth).
# smoothscatter version
smoothScatter(x = dat_rowmeans$a, y = dat_rowmeans$m,
xlab = "A",
ylab = "M")
Plot for one cell only.
dat_prem_subset = as_tibble(dat_prem[, 1:5]) %>%
rename_all(function(x) paste0(x, "_premrna")) %>%
mutate(genes = rownames(dat_prem))
dat_mrna_subset = as_tibble(dat_mrna[, 1:5]) %>%
rename_all(function(x) paste0(x, "_mrna")) %>%
mutate(genes = rownames(dat_mrna))
dat_subset = full_join(dat_prem_subset, dat_mrna_subset, by = "genes")
dat_subset %>%
mutate(m = log(`BT1179Nuc-P1-A01_premrna`) - log(`BT1179Nuc-P1-A01_mrna`),
a = (log(`BT1179Nuc-P1-A01_premrna`) + log(`BT1179Nuc-P1-A01_mrna`)/2)) %>%
ggplot(aes(x = a,
y = m)) +
stat_density2d(aes(fill = ..density..^0.3), geom = "tile", contour = FALSE, n = 200) +
scale_fill_continuous(low = "white", high = "#1C5E7E") +
geom_hline(yintercept = 0, color = "red") +
geom_smooth() +
geom_point(alpha = 0.1) +
labs(x = "A",
y = "M") +
theme_bw()
## Warning: Removed 51526 rows containing non-finite values (stat_density2d).
## Warning: Removed 51526 rows containing non-finite values (stat_smooth).
## Warning: Removed 42769 rows containing missing values (geom_point).
First, I make all the column sums the same by scaling. The count in row \(i\) and column \(j\) is transformed as \(x_{ij} = x_{ij}*\sum_i x_{ij}/median_j(\sum_i x_{ij})\).
summary(colSums(dat_prem))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1981 14825 36103 44062 61583 351921
# Apply transformation
n = round(median(colSums(dat_prem)))
dat_prem = apply(dat_prem, MARGIN = 2, function(x) x*n/sum(x))
summary(colSums(dat_prem))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 36103 36103 36103 36103 36103 36103
I plot the mean and variance for every row (transcript). Under a poisson distribution, they should be the same.
plot_meanvar = function(dat_sub){
# estimate lambdas and variances for every transcript
means = rowMeans(dat_sub)
vars = apply(dat_sub, MARGIN = 1, var)
# variance = mean under Poisson distribution
tibble(means, vars) %>%
ggplot(aes(x = log(means), y = log(vars))) +
geom_point(alpha = 0.4) +
geom_abline(intercept = 0, slope = 1)
}
plot_meanvar(dat_prem) +
labs(title = "pre-mRNA")
I first compute the average expression for every row (x-axis) \(\hat{\mu}_i\) and the empirical \(P(X_i=0)\), which is the probability that for a given transcript \(i\), the count is 0.
I then compute what \(P(X_i=0)\) would be under the model assumptions of binomial or poisson, using parameters estimated from the data. In particular, for a \(Binom(n, p_i)\), \(n\) is the median number of total counts of cells and \(p_i\) is the mean proportion of counts that were in gene \(i\).
# Function to plot P(X_i = 0) against average expression level mu_i
plot_prob = function(dat_sub){
n = round(median(colSums(dat_sub)))
means = rowMeans(dat_sub)
vars = apply(dat_sub, MARGIN = 1, var)
# empirical P(X_i = 0)
emp_probs_0 = apply(dat_sub, MARGIN = 1, function(r) sum(r==0)/ncol(dat_sub))
plot_dt = tibble(means, emp_probs_0)
emp_props = rowMeans(dat_sub/colSums(dat_sub))
# Model P(X_i = 0) under Binomial
binom_probs_0 = dbinom(x = 0, size = n, prob = emp_props)
# Model P(X_i = 0) under Poisson
poiss_probs_0 = dpois(x = 0, lambda = n*emp_props)
# Model P(X_i = 0) under Negative Binomial
# Estimate size/dispersion parameter
model = lm(vars ~ 1*means + I(means^2) + 0, tibble(means, vars))
phi = 1/coef(model)["I(means^2)"]
nbinom_probs_0 = dnbinom(x = 0, size = phi, mu = n*emp_props)
# Tibble for plot
plot_lines_dt = tibble(means = means,
binomial = binom_probs_0,
poisson = poiss_probs_0,
nbinomial = nbinom_probs_0) %>%
pivot_longer(-means, names_to = "model", values_to = "probs_0")
# Plot
plt = plot_lines_dt %>%
ggplot(aes(x = log(means), y = probs_0)) +
geom_point(data = plot_dt, aes(x = log(means), y = emp_probs_0), alpha = 0.4) + # Add data points
geom_line(aes(color = model),
size = 1.5) + # Add lines for models
labs(x = "Average expression level log(E(X_i))",
y = "P(X_i = 0)") +
theme(text = element_text(size = 15))
# Return object
out = list(plot = plt,
lines_dt = plot_lines_dt)
return(out)
}
# Plot P(X_i = 0) against average expression level
prob_out_prem = plot_prob(dat_prem)
prob_out_prem$plot +
labs(title = "pre-mRNA")
summary(colSums(dat_mrna))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1695 22260 47320 55303 77240 440616
# Apply transformation
n = round(median(colSums(dat_mrna)))
dat_mrna = apply(dat_mrna, MARGIN = 2, function(x) x*n/sum(x))
summary(colSums(dat_mrna))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 47320 47320 47320 47320 47320 47320
# Plot mean against variance
plot_meanvar(dat_mrna) +
labs(title = "mRNA")
# Plot P(X_i = 0) against average expression level
prob_out_mrna = plot_prob(dat_mrna)
prob_out_mrna$plot +
labs(title = "mRNA")
library(scran)
## Warning: package 'scran' was built under R version 3.6.2
library(scater)
library(BiocSingular)
## Warning: package 'BiocSingular' was built under R version 3.6.2
# library(factoextra)
library(tictoc)
library(glmpca)
source(here("scripts", "functions_genefilter.R"))
sce = sce_prem
dat = dat_prem
# Normalize and take log
clust = quickCluster(sce)
table(clust)
## clust
## 1 2 3 4 5 6 7
## 118 202 169 131 163 100 207
sce = computeSumFactors(sce, clusters = clust)
sce = logNormCounts(sce)
# Filter out highly variant genes
# filterHVG(sce)
set.seed(1)
tic("approx PCA")
sce = runPCA(sce, exprs_values = "logcounts", ntop = ncol(sce), BSPARAM = IrlbaParam())
pc = list(x = reducedDim(sce, "PCA"))
toc()
## approx PCA: 2.162 sec elapsed
# set.seed(1)
# Run random PCA (2 minutes)
# tic("random PCA")
# pc = runPCA(t(X), rank = 5, BSPARAM = RandomParam())
# toc()
# Run PCA with prcomp (~10 minutes)
# X = t(X)
# nonzero_var_cols = which(apply(X, 2, var) != 0) # Remove columns (transcripts) with zero variance
# X = X[, nonzero_var_cols]
# pc = prcomp(X, center = T, scale = F)
# % variance explained by each PC
# fviz_eig(pc)
tumor_labels = gsub("-.*$", "", colnames(sce))
# Plot PC
data.frame(PC1 = pc$x[,1], PC2 = pc$x[,2], color = tumor_labels) %>%
ggplot(aes(PC1, PC2, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC1 = pc$x[,1], PC3 = pc$x[,3], color = tumor_labels) %>%
ggplot(aes(PC1, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC2 = pc$x[,2], PC3 = pc$x[,3], color = tumor_labels) %>%
ggplot(aes(PC2, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC4 = pc$x[,4], PC5 = pc$x[,5], color = tumor_labels) %>%
ggplot(aes(PC4, PC5, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
sce = sce_mrna
dat = dat_mrna
# Normalize and take log
clust = quickCluster(sce)
## Warning in regularize.values(x, y, ties, missing(ties)): collapsing to unique
## 'x' values
table(clust)
## clust
## 1 2 3 4 5
## 184 135 239 210 250
sce = computeSumFactors(sce, clusters = clust)
sce = logNormCounts(sce)
set.seed(1)
tic("approx PCA")
sce = runPCA(sce, exprs_values = "logcounts", BSPARAM = IrlbaParam())
pc = list(x = reducedDim(sce, "PCA"))
toc()
## approx PCA: 1.632 sec elapsed
# set.seed(1)
# Run random PCA (2 minutes)
# tic("random PCA")
# pc = runPCA(t(X), rank = 5, BSPARAM = RandomParam())
# toc()
# Run PCA with prcomp (~10 minutes)
# X = t(X)
# nonzero_var_cols = which(apply(X, 2, var) != 0) # Remove columns (transcripts) with zero variance
# X = X[, nonzero_var_cols]
# pc = prcomp(X, center = T, scale = F)
# % variance explained by each PC
# fviz_eig(pc)
tumor_labels = gsub("-.*$", "", colnames(sce))
# Plot PC
data.frame(PC1 = pc$x[,1], PC2 = pc$x[,2], color = tumor_labels) %>%
ggplot(aes(PC1, PC2, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC1 = pc$x[,1], PC3 = pc$x[,3], color = tumor_labels) %>%
ggplot(aes(PC1, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC2 = pc$x[,2], PC3 = pc$x[,3], color = tumor_labels) %>%
ggplot(aes(PC2, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC4 = pc$x[,4], PC5 = pc$x[,5], color = tumor_labels) %>%
ggplot(aes(PC4, PC5, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
sce = sce_prem
dat = dat_prem
X = dat
# Takes a long time, may want to try on cluster
# Time increases with number of rows and number of latent dimensions ("L")
tic("glm PCA")
glmpc = glmpca(X, L = 3, fam = c("poi"))
toc()
tumor_labels = gsub("-.*$", "", colnames(sce))
# Plot PC
data.frame(PC1 = glmpc$factors[,1], PC2 = glmpc$factors[,2], color = tumor_labels) %>%
ggplot(aes(PC1, PC2, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC1 = glmpc$factors[,1], PC3 = glmpc$factors[,3], color = tumor_labels) %>%
ggplot(aes(PC1, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()
data.frame(PC2 = glmpc$factors[,2], PC3 = glmpc$factors[,3], color = tumor_labels) %>%
ggplot(aes(PC2, PC3, color = color)) +
geom_point(alpha = 0.5) +
theme_bw()